InĀ [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
InĀ [13]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target
df
Out[13]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422
... ... ... ... ... ... ... ... ... ...
20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 -121.09 0.781
20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 -121.21 0.771
20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 -121.22 0.923
20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 -121.32 0.847
20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 -121.24 0.894

20640 rows Ɨ 9 columns

InĀ [14]:
fig, axes = plt.subplots(3, 3, figsize=(10, 5))
for i, column in enumerate(df.columns):
    df[column].plot(kind='hist', ax=axes[i // 3, i % 3], title=f'Histogram of {column}', bins=30)
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [15]:
from sklearn.preprocessing import StandardScaler

df['AveRooms'] = StandardScaler().fit_transform(df['AveRooms'].values.reshape(-1, 1))
df['AveBedrms'] = StandardScaler().fit_transform(df['AveBedrms'].values.reshape(-1, 1))
df
Out[15]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal
0 8.3252 41.0 0.628559 -0.153758 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 0.327041 -0.263336 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 1.155620 -0.049016 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 0.156966 -0.049833 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 0.344711 -0.032906 565.0 2.181467 37.85 -122.25 3.422
... ... ... ... ... ... ... ... ... ...
20635 1.5603 25.0 -0.155023 0.077354 845.0 2.560606 39.48 -121.09 0.781
20636 2.5568 18.0 0.276881 0.462365 356.0 3.122807 39.49 -121.21 0.771
20637 1.7000 17.0 -0.090318 0.049414 1007.0 2.325635 39.43 -121.22 0.923
20638 1.8672 18.0 -0.040211 0.158778 741.0 2.123209 39.43 -121.32 0.847
20639 2.3886 16.0 -0.070443 0.138403 1387.0 2.616981 39.37 -121.24 0.894

20640 rows Ɨ 9 columns

InĀ [16]:
from sklearn.model_selection import train_test_split

y = df['MedHouseVal']
X = df.drop(['MedHouseVal'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape
Out[16]:
((16512, 8), (4128, 8))
InĀ [17]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
Out[17]:
0.5757877060324511
InĀ [18]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr.score(X_test, y_test)
Out[18]:
0.6184586450052418
InĀ [19]:
from sklearn.tree import plot_tree

plt.figure(figsize=(14, 7))
plot_tree(dtr, filled=True, feature_names=X_test.columns, max_depth=3, proportion=True, fontsize=8)
plt.show()
No description has been provided for this image
InĀ [20]:
import dtreeviz

viz_rmodel = dtreeviz.model(dtr, X_test, y_test, target_name='MedHouseVal', feature_names=X_test.columns)

viz_rmodel.view(depth_range_to_display=(1, 5), orientation='LR', scale=1.5)
x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names
x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\numpy\_core\fromnumeric.py:3596: RuntimeWarning: Mean of empty slice.
x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\numpy\_core\_methods.py:138: RuntimeWarning: invalid value encountered in scalar divide
Out[20]:
No description has been provided for this image
InĀ [44]:
import random
from pprint import pprint

best_choice = {}

for cnt in range(1, 600):
    dtr = DecisionTreeRegressor(max_depth=random.randint(1, 20),
                                min_samples_leaf=random.randint(1, 30),
                                max_features=random.randint(1, 8))
    dtr.fit(X_train, y_train)
    res_scores = dtr.score(X_test, y_test)
    ini_scores = dtr.score(X_train, y_train)
    if res_scores > best_choice.get('score', 0):
        print(f"Best Score: {res_scores:.6} | Initial Score: {ini_scores:.6}")
        best_choice['score'] = res_scores
        best_choice['model'] = dtr
print()
pprint(best_choice)
Best Score: 0.680757 | Initial Score: 0.770631
Best Score: 0.681313 | Initial Score: 0.900481
Best Score: 0.701296 | Initial Score: 0.840353
Best Score: 0.736389 | Initial Score: 0.812619
Best Score: 0.738296 | Initial Score: 0.841445
Best Score: 0.748956 | Initial Score: 0.882894
Best Score: 0.756807 | Initial Score: 0.82948

{'model': DecisionTreeRegressor(max_depth=13, max_features=5, min_samples_leaf=18),
 'score': 0.7568068658250048}